---
title: "Conditional Probability"
author: "Hannah S.K. Pahama"
date: "February 2025"
output: html_document
---

Abstract

This document explores conditional probability using a dataset of Pinot wine reviews: it computes a conditional probability from text features, trains a Naive Bayes classifier to predict a wine's province, and analyzes word-frequency differences between provinces.

format: 
  html:
    embed-resources: true

1. Setup

Let’s start by setting up the code and loading the necessary libraries:

# Install any required packages that are not already present.
# (dplyr and stringr are part of the tidyverse, so they are not
# listed separately.)
packages <- c("tidyverse",
              "caret",
              "knitr",
              "kableExtra",
              "tidytext",
              "naivebayes",
              "plotly")

new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

suppressPackageStartupMessages({
  library(tidyverse)  # dplyr, stringr, tidyr, ggplot2, ...
  library(caret)      # createDataPartition, train, confusionMatrix
  library(knitr)      # kable() for pretty tables
  library(kableExtra) # Extra table styling
  library(tidytext)   # unnest_tokens(), stop_words
  library(naivebayes) # Backend for caret's "naive_bayes" method
  library(plotly)     # Interactive 3D plot
})

# Pinot reviews dataset (includes province and description columns)
wine <- readRDS(gzcon(url("https://github.com/cd-public/D505/raw/master/dat/pinot.rds")))

2. Conditional Probability

We’re interested in calculating the probability that a Pinot comes from Burgundy, given that it contains the word ‘fruit’ in the description.

Solution:

First, we’ll filter the data to find wines that contain the word “fruit” in their description, and then calculate the conditional probability.

# P(province == Burgundy | description mentions "fruit"):
# keep only the "fruit" wines, then take the Burgundy share.
fruity_wines <- wine %>%
  filter(str_detect(description, "fruit"))

fruity_wines %>%
  summarise(prop = mean(province == "Burgundy")) %>%
  pull(prop)
## [1] 0.2196038

3. Naive Bayes Algorithm

We’ll train a Naive Bayes model to classify a wine’s province using:

- An 80-20 train-test split.
- Three features engineered from the wine description.
- 5-fold cross-validation.

Afterward, we’ll report the Kappa value after using the model to predict provinces in the holdout sample.

Solution:

# Engineer three boolean text features from the description,
# then drop the raw text so the model sees only the features.
wino <- wine %>% 
  mutate(
    cherry = str_detect(description, "cherry"),
    chocolate = str_detect(description, "chocolate"),
    earth = str_detect(description, "earth")
  ) %>%
  select(-description)

# Fix the RNG so the partition — and therefore the reported
# confusion matrix and Kappa — is reproducible across renders.
set.seed(505)

# Stratified 80-20 train/test split on province
wine_index <- createDataPartition(wino$province,
                                  p = 0.80,
                                  list = FALSE)
train <- wino[wine_index, ]
test <- wino[-wine_index, ]

# Train Naive Bayes with 5-fold cross-validation, tuned on Kappa
fit <- train(
  province ~ .,
  data = train, 
  method = "naive_bayes",
  metric = "Kappa",
  trControl = trainControl(method = "cv", number = 5)
)

# Holdout evaluation: confusion matrix (its summary includes Kappa)
conf_mat <- confusionMatrix(predict(fit, test), factor(test$province))

# Convert the count table to a tidy data frame for display
conf_table <- as.data.frame(conf_mat$table)

# Print confusion matrix as a styled HTML table
kable(conf_table, caption = "Confusion Matrix") %>%
  kable_styling(bootstrap_options = c("striped", 
                                      "hover", 
                                      "condensed"))
Confusion Matrix
Prediction Reference Freq
Burgundy Burgundy 133
California Burgundy 86
Casablanca_Valley Burgundy 2
Marlborough Burgundy 0
New_York Burgundy 3
Oregon Burgundy 14
Burgundy California 62
California California 691
Casablanca_Valley California 1
Marlborough California 1
New_York California 10
Oregon California 26
Burgundy Casablanca_Valley 9
California Casablanca_Valley 9
Casablanca_Valley Casablanca_Valley 1
Marlborough Casablanca_Valley 1
New_York Casablanca_Valley 2
Oregon Casablanca_Valley 4
Burgundy Marlborough 7
California Marlborough 19
Casablanca_Valley Marlborough 0
Marlborough Marlborough 2
New_York Marlborough 10
Oregon Marlborough 7
Burgundy New_York 3
California New_York 15
Casablanca_Valley New_York 0
Marlborough New_York 0
New_York New_York 6
Oregon New_York 2
Burgundy Oregon 88
California Oregon 293
Casablanca_Valley Oregon 1
Marlborough Oregon 2
New_York Oregon 14
Oregon Oregon 149

4. Frequency Differences

We aim to find the three words that most distinguish New York Pinots from all other Pinots.

Solution:

# Step 1: Tokenize descriptions and count words (stop words removed),
# separately for New York Pinots and all other Pinots
ny_word_count <- wine %>%
  filter(province == "New_York") %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  count(word)

other_word_count <- wine %>%
  filter(province != "New_York") %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  count(word)

# Step 2: Join the counts and compute the difference.
# After the suffixed full join the count columns are n_ny and
# n_other (there is no column named n), so both must be
# zero-filled explicitly: a word seen in only one group would
# otherwise produce an NA diff and be silently dropped from
# the ranking.
word_diff <- full_join(ny_word_count,
                       other_word_count, by = "word", 
                       suffix = c("_ny", "_other")) %>%
  replace_na(list(n_ny = 0, n_other = 0)) %>%
  mutate(diff = n_ny - n_other) %>%
  arrange(desc(abs(diff)))

# Step 3: The three words with the largest absolute difference
top_3_words <- word_diff %>%
  top_n(3, abs(diff)) %>%
  select(word, diff)

top_3_words
##     word  diff
## 1   wine -5680
## 2  fruit -4092
## 3 cherry -3436
# Visualization
# (plotly and dplyr are already attached in the setup chunk,
# so they are not re-loaded here.)

# Prepare data for plotting: words along x, count difference on y;
# z is a constant 0 placeholder so a 3D trace can be drawn.
top_3_words <- top_3_words %>%
  mutate(x = seq_along(word),
         y = diff,
         z = rep(0, n()))

# Create the 3D scatter plot with a text label at each marker
fig <- plot_ly(top_3_words, 
              x = ~x, 
              y = ~y, 
              z = ~z, 
              type = 'scatter3d', 
              mode = 'markers+text', 
              text = ~word,
              marker = list(size = 10)) %>%
  layout(scene = list(
    xaxis = list(title = 'Words'),
    yaxis = list(title = 'Difference (NY vs Other)'),
    zaxis = list(title = 'Z')
  ))

# Render the plot
fig

5. Extension (Optional)

Bonus Problem:

We’ll calculate the variance of the logged word-frequency distributions for each province.

# Word frequencies per province, log-transformed.
# log(n) is a row-wise operation, so no group_by() is needed
# before the mutate; grouping once before summarise() suffices.
word_freq_by_province <- wine %>%
  unnest_tokens(word, description) %>%
  count(province, word) %>%
  mutate(log_freq = log(n))

# Variance of the logged word-frequency distribution, by province
word_freq_variance <- word_freq_by_province %>%
  group_by(province) %>%
  summarise(variance = var(log_freq))

word_freq_variance
## # A tibble: 6 × 2
##   province          variance
##   <chr>                <dbl>
## 1 Burgundy              2.18
## 2 California            2.18
## 3 Casablanca_Valley     1.08
## 4 Marlborough           1.19
## 5 New_York              1.13
## 6 Oregon                2.08
  • “The only way to do great work is to love what you do.”